library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.4
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr) # Lirary for spliting train & test dataset
library(corrplot) # Plotting nice correlation matrix
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.2
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(pROC) # For checking ROC Curve of the model
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(bestglm)
## Loading required package: leaps
library(brglm)
## Loading required package: profileModel
## 'brglm' will gradually be superseded by 'brglm2' (https://cran.r-project.org/package=brglm2), which provides utilities for mean and median bias reduction for all GLMs and methods for the detection of infinite estimates in binomial-response models.
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ResourceSelection)
## ResourceSelection 0.3-5 2019-07-22
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(gmodels)
##
## Attaching package: 'gmodels'
## The following object is masked from 'package:pROC':
##
## ci
library(mice)
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(nnet)
library(VIM)
## Loading required package: colorspace
##
## Attaching package: 'colorspace'
## The following object is masked from 'package:pROC':
##
## coords
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 3.0-2
setwd("/Users/haidu/Desktop/statistics/Categorical\ Data\ Analysis\ /project")
loan <- read.csv("loan.csv", header=T, na.strings=c("","NA"))
loan_data <- loan
attach(loan_data)
loan_data <- dplyr::select(loan_data, -Loan_ID) #Removing Loan_ID as it has no logical corelation
head(loan_data)
Gender <fctr> | Married <fctr> | Dependents <fctr> | Education <fctr> | Self_Employed <fctr> | ApplicantIncome <int> | CoapplicantIncome <dbl> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | Male | No | 0 | Graduate | No | 5849 | 0 | |
| 2 | Male | Yes | 1 | Graduate | No | 4583 | 1508 | |
| 3 | Male | Yes | 0 | Graduate | Yes | 3000 | 0 | |
| 4 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358 | |
| 5 | Male | No | 0 | Graduate | No | 6000 | 0 | |
| 6 | Male | Yes | 2 | Graduate | Yes | 5417 | 4196 |
sum(is.na(loan_data)) ## Checking for total missing values
## [1] 149
colSums(is.na(loan_data)) ## checking for any missing values in the feature
## Gender Married Dependents Education
## 13 3 15 0
## Self_Employed ApplicantIncome CoapplicantIncome LoanAmount
## 32 0 0 22
## Loan_Amount_Term Credit_History Property_Area Loan_Status
## 14 50 0 0
aggr(loan,prop=FALSE,numbers=TRUE)
ggplot(data=loan_data) +
geom_point(aes(x=LoanAmount, y=Loan_Status, color=Property_Area))
## Warning: Removed 22 rows containing missing values (geom_point).
ggplot(data=loan_data) +
geom_bar(aes(x=Loan_Status,color=Loan_Status))
ggplot(data=loan_data) +
geom_bar(aes(x=Loan_Amount_Term, fill=Loan_Status ))
## Warning: Removed 14 rows containing non-finite values (stat_count).
## Warning: position_stack requires non-overlapping x intervals
ggplot(data=loan_data) +
geom_histogram(aes(x=LoanAmount), bins = 50)
## Warning: Removed 22 rows containing non-finite values (stat_bin).
ggplot(data=loan_data) +
geom_histogram(aes(x=ApplicantIncome))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=loan_data) +
geom_histogram(aes(x=CoapplicantIncome ))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=loan_data) +
geom_bar(aes(x=Credit_History, fill=Loan_Status))
## Warning: Removed 50 rows containing non-finite values (stat_count).
ggplot(data=loan_data) +
geom_bar(aes(x=Dependents,fill=Loan_Status ))
ggplot(data=loan_data) +
geom_bar(aes(x=Education, fill=Loan_Status))
ggplot(data=loan_data) +
geom_bar(aes(x=Married, fill=Loan_Status))
#Making Contingency Table to check percentage of Credit_History in relation with loan status
CrossTable(loan_data$Loan_Status, loan_data$Credit_History,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 564
##
##
## | loan_data$Credit_History
## loan_data$Loan_Status | 0 | 1 | Row Total |
## ----------------------|-----------|-----------|-----------|
## N | 82 | 97 | 179 |
## | 0.458 | 0.542 | 0.317 |
## ----------------------|-----------|-----------|-----------|
## Y | 7 | 378 | 385 |
## | 0.018 | 0.982 | 0.683 |
## ----------------------|-----------|-----------|-----------|
## Column Total | 89 | 475 | 564 |
## ----------------------|-----------|-----------|-----------|
##
##
CrossTable(loan_data$Loan_Status, loan_data$Married,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
prop.chisq = FALSE )
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 611
##
##
## | loan_data$Married
## loan_data$Loan_Status | No | Yes | Row Total |
## ----------------------|-----------|-----------|-----------|
## N | 79 | 113 | 192 |
## | 0.411 | 0.589 | 0.314 |
## ----------------------|-----------|-----------|-----------|
## Y | 134 | 285 | 419 |
## | 0.320 | 0.680 | 0.686 |
## ----------------------|-----------|-----------|-----------|
## Column Total | 213 | 398 | 611 |
## ----------------------|-----------|-----------|-----------|
##
##
CrossTable(loan_data$Loan_Status, loan_data$Education,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
prop.chisq = FALSE )
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 614
##
##
## | loan_data$Education
## loan_data$Loan_Status | Graduate | Not Graduate | Row Total |
## ----------------------|--------------|--------------|--------------|
## N | 140 | 52 | 192 |
## | 0.729 | 0.271 | 0.313 |
## ----------------------|--------------|--------------|--------------|
## Y | 340 | 82 | 422 |
## | 0.806 | 0.194 | 0.687 |
## ----------------------|--------------|--------------|--------------|
## Column Total | 480 | 134 | 614 |
## ----------------------|--------------|--------------|--------------|
##
##
CrossTable(loan_data$Loan_Status, loan_data$Self_Employed,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
prop.chisq = FALSE )
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 582
##
##
## | loan_data$Self_Employed
## loan_data$Loan_Status | No | Yes | Row Total |
## ----------------------|-----------|-----------|-----------|
## N | 157 | 26 | 183 |
## | 0.858 | 0.142 | 0.314 |
## ----------------------|-----------|-----------|-----------|
## Y | 343 | 56 | 399 |
## | 0.860 | 0.140 | 0.686 |
## ----------------------|-----------|-----------|-----------|
## Column Total | 500 | 82 | 582 |
## ----------------------|-----------|-----------|-----------|
##
##
CrossTable(loan_data$Loan_Status, loan_data$Property_Area,prop.r = TRUE, prop.c = FALSE, prop.t = FALSE,
prop.chisq = FALSE )
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 614
##
##
## | loan_data$Property_Area
## loan_data$Loan_Status | Rural | Semiurban | Urban | Row Total |
## ----------------------|-----------|-----------|-----------|-----------|
## N | 69 | 54 | 69 | 192 |
## | 0.359 | 0.281 | 0.359 | 0.313 |
## ----------------------|-----------|-----------|-----------|-----------|
## Y | 110 | 179 | 133 | 422 |
## | 0.261 | 0.424 | 0.315 | 0.687 |
## ----------------------|-----------|-----------|-----------|-----------|
## Column Total | 179 | 233 | 202 | 614 |
## ----------------------|-----------|-----------|-----------|-----------|
##
##
inputs the mean into the missing values, similary for categorical variable, we can use the category that apperars the most frequently
loan_data <- loan_data %>%
mutate(LoanAmount=ifelse(is.na(LoanAmount), mean(LoanAmount, na.rm = T), LoanAmount),
Loan_Amount_Term=ifelse(is.na(Loan_Amount_Term), median(Loan_Amount_Term, na.rm = T), Loan_Amount_Term),
Credit_History=ifelse(is.na(Credit_History), 1, Credit_History))
(Replace outlier with lower and upper cutoff value) using the rule of thumb where upper limit is computed as 1.5 * IRQ, where IRQ = 3rd Quartile – 1st Quartile.
#scatter plot to detect outliers for ApplicantIncome
plot(ApplicantIncome, ylab = "ApplicantIncome")
outliers_upperlimit_AppIncome <- quantile(ApplicantIncome, 0.75) + 1.5 * IQR(ApplicantIncome) # upper_limit = 10171.25
index.outliers.ApplicantIncome <- which(ApplicantIncome > outliers_upperlimit_AppIncome | ApplicantIncome < 0 ) # 50 outliers
loan_data <- loan_data[-index.outliers.ApplicantIncome,] #Removing observations
plot(loan_data$ApplicantIncome, ylab = "ApplicantIncome")
# detect outliers for CoapplicantIncome
plot(loan_data$CoapplicantIncome, ylab = "CoapplicantIncome")
outliers_upperlimit_CoIncome <- quantile(loan_data$CoapplicantIncome, 0.75) + 1.5 * IQR(loan_data$CoapplicantIncome)
index.outliers.CoIncome <- which(loan_data$CoapplicantIncome > outliers_upperlimit_CoIncome | loan_data$CoapplicantIncome < 0 )
loan_data <- loan_data[-index.outliers.CoIncome,] #Removing observations
plot(loan_data$CoapplicantIncome, ylab = "CoapplicantIncome")
# Treatment of outlier for LoanAmount
plot(loan_data$LoanAmount, ylab = "LoanAmount")
outliers_upperlimit_LoanAmount <- quantile(loan_data$LoanAmount, 0.75) + 1.5 * IQR(loan_data$LoanAmount)
index.outliers.LoanAmount <- which(loan_data$LoanAmount > outliers_upperlimit_LoanAmount | loan_data$LoanAmount < 0 )
loan_data <- loan_data[-index.outliers.LoanAmount,] #Removing observations
plot(loan_data$LoanAmount, ylab = "LoanAmount")
create dummy variables for categorical attributes
#I converted the Dependents variable to a continuous variable in order
loan_data$Dependents=as.numeric(substr(loan_data$Dependents,1,1))
loan_data <- loan_data %>%
mutate(Gender=ifelse(Gender=="Male",1,0),
Married=ifelse(Married=="Yes",1,0),
Education=ifelse(Education=="Graduate",1,0),
Self_Employed=ifelse(Self_Employed=="Yes",1,0),
Loan_Status=ifelse(Loan_Status=="Y",1,0))
#deal with missing value again, for catergorical data, use the category that appears most frequently
loan_data <- loan_data %>%
mutate(Gender=ifelse(is.na(Gender),1,Gender),
Married=ifelse(is.na(Married),1,Married),
Dependents=ifelse(is.na(Dependents),0,Dependents),
Self_Employed=ifelse(is.na(Self_Employed),0,Self_Employed))
# More than 2 unique values treatment
loan_data$Urban <- ifelse(loan_data$Property_Area=="Urban",1,0)
loan_data$Rural <- ifelse(loan_data$Property_Area=="Rural",1,0)
loan_data$Semiurban <-ifelse(loan_data$Property_Area=="Semiurban",1,0)
#check for correlation between the variables exploratory data analysis is to check correlations among all variables
cor(loan_data[, sapply(loan_data, class) != "factor" ],) #Checking multicollinearity
## Gender Married Dependents Education
## Gender 1.000000000 0.366896175 0.197637376 -0.077147385
## Married 0.366896175 1.000000000 0.347494857 -0.031658148
## Dependents 0.197637376 0.347494857 1.000000000 -0.091162626
## Education -0.077147385 -0.031658148 -0.091162626 1.000000000
## Self_Employed 0.018360619 0.011510026 0.044635456 -0.002700106
## ApplicantIncome 0.058170211 -0.011708393 0.106643913 0.133652845
## CoapplicantIncome 0.198662627 0.257122438 -0.050769669 0.026163783
## LoanAmount 0.144713240 0.195775280 0.094648465 0.087518158
## Loan_Amount_Term -0.081534683 -0.091292772 -0.097422737 0.070136775
## Credit_History 0.005464585 0.008026898 -0.030432194 0.083186603
## Loan_Status 0.051440973 0.083969054 -0.002540811 0.116144180
## Urban 0.045120925 -0.004504906 0.005676846 0.022884170
## Rural 0.074089533 -0.017468340 -0.036652822 -0.080294161
## Semiurban -0.113276575 0.020836088 0.029235952 0.054044306
## Self_Employed ApplicantIncome CoapplicantIncome
## Gender 0.018360619 0.058170211 0.198662627
## Married 0.011510026 -0.011708393 0.257122438
## Dependents 0.044635456 0.106643913 -0.050769669
## Education -0.002700106 0.133652845 0.026163783
## Self_Employed 1.000000000 0.179528002 -0.045701089
## ApplicantIncome 0.179528002 1.000000000 -0.270461490
## CoapplicantIncome -0.045701089 -0.270461490 1.000000000
## LoanAmount 0.090048388 0.436923090 0.310754050
## Loan_Amount_Term -0.066294038 -0.074934591 -0.010446208
## Credit_History 0.029057610 0.053453818 0.001293975
## Loan_Status -0.013524023 0.019987144 0.065063190
## Urban -0.073516055 -0.088117010 -0.053587233
## Rural 0.041375963 0.096143868 0.085539168
## Semiurban 0.031238872 -0.006589329 -0.029613571
## LoanAmount Loan_Amount_Term Credit_History
## Gender 0.144713240 -0.08153468 0.005464585
## Married 0.195775280 -0.09129277 0.008026898
## Dependents 0.094648465 -0.09742274 -0.030432194
## Education 0.087518158 0.07013678 0.083186603
## Self_Employed 0.090048388 -0.06629404 0.029057610
## ApplicantIncome 0.436923090 -0.07493459 0.053453818
## CoapplicantIncome 0.310754050 -0.01044621 0.001293975
## LoanAmount 1.000000000 0.07692013 -0.005444425
## Loan_Amount_Term 0.076920126 1.00000000 -0.018932317
## Credit_History -0.005444425 -0.01893232 1.000000000
## Loan_Status -0.021883151 -0.02571920 0.569831570
## Urban -0.147243718 -0.10615095 -0.007679229
## Rural 0.109511523 0.03674422 -0.031672381
## Semiurban 0.037368052 0.06686150 0.037310664
## Loan_Status Urban Rural Semiurban
## Gender 0.051440973 0.045120925 0.07408953 -0.113276575
## Married 0.083969054 -0.004504906 -0.01746834 0.020836088
## Dependents -0.002540811 0.005676846 -0.03665282 0.029235952
## Education 0.116144180 0.022884170 -0.08029416 0.054044306
## Self_Employed -0.013524023 -0.073516055 0.04137596 0.031238872
## ApplicantIncome 0.019987144 -0.088117010 0.09614387 -0.006589329
## CoapplicantIncome 0.065063190 -0.053587233 0.08553917 -0.029613571
## LoanAmount -0.021883151 -0.147243718 0.10951152 0.037368052
## Loan_Amount_Term -0.025719205 -0.106150948 0.03674422 0.066861503
## Credit_History 0.569831570 -0.007679229 -0.03167238 0.037310664
## Loan_Status 1.000000000 -0.042447024 -0.10757726 0.142393338
## Urban -0.042447024 1.000000000 -0.44790480 -0.533621281
## Rural -0.107577256 -0.447904799 1.00000000 -0.517134043
## Semiurban 0.142393338 -0.533621281 -0.51713404 1.000000000
# graph check multicollinearity
correlation <- loan_data[sapply(loan_data, is.numeric)]
descrCorr <- cor(correlation)
corrplot(descrCorr)
pairs.panels(loan_data)
# let's take a look initial model
head(loan_data)
Gender <dbl> | Married <dbl> | Dependents <dbl> | Education <dbl> | Self_Employed <dbl> | ApplicantIncome <int> | CoapplicantIncome <dbl> | LoanAmount <dbl> | ||
|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 0 | 0 | 1 | 0 | 5849 | 0 | 146.4122 | |
| 2 | 1 | 1 | 1 | 1 | 0 | 4583 | 1508 | 128.0000 | |
| 3 | 1 | 1 | 0 | 1 | 1 | 3000 | 0 | 66.0000 | |
| 4 | 1 | 1 | 0 | 0 | 0 | 2583 | 2358 | 120.0000 | |
| 5 | 1 | 0 | 0 | 1 | 0 | 6000 | 0 | 141.0000 | |
| 6 | 1 | 1 | 0 | 0 | 0 | 2333 | 1516 | 95.0000 |
loan_data_R <- dplyr::select(loan_data, -Property_Area)
reg1 <- glm(Loan_Status~., family = binomial, data = loan_data_R)
summary(reg1)
##
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2938 -0.3139 0.5005 0.6809 2.6244
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.541e+00 9.550e-01 -2.660 0.007806 **
## Gender 3.093e-01 3.257e-01 0.950 0.342319
## Married 4.423e-01 2.920e-01 1.515 0.129762
## Dependents -2.877e-02 1.366e-01 -0.211 0.833221
## Education 4.837e-01 2.740e-01 1.766 0.077460 .
## Self_Employed -3.218e-01 3.669e-01 -0.877 0.380433
## ApplicantIncome 1.019e-04 8.949e-05 1.139 0.254630
## CoapplicantIncome 2.072e-04 1.152e-04 1.799 0.072011 .
## LoanAmount -7.525e-03 4.023e-03 -1.871 0.061383 .
## Loan_Amount_Term -4.842e-04 1.937e-03 -0.250 0.802634
## Credit_History 4.215e+00 4.910e-01 8.585 < 2e-16 ***
## Urban -8.709e-01 3.097e-01 -2.812 0.004920 **
## Rural -1.012e+00 3.054e-01 -3.315 0.000917 ***
## Semiurban NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 449.00 on 509 degrees of freedom
## AIC: 475
##
## Number of Fisher Scoring iterations: 5
cor(loan_data_R$Loan_Status, fitted(reg1)) #0.60
## [1] 0.6014094
hoslem.test(loan_data_R$Loan_Status, fitted(reg1)) #p-value = 0.4663
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: loan_data_R$Loan_Status, fitted(reg1)
## X-squared = 7.6703, df = 8, p-value = 0.4663
####################################################################################
#mice treating missing valued by using the loan data = origional data
loan <- dplyr::select(loan,-Loan_ID)
aggr(loan,prop=FALSE,numbers=TRUE)
aggr(loan,prop=TRUE,numbers=TRUE)
#Multiple Imputation
imps = mice(loan)
##
## iter imp variable
## 1 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 1 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 1 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 1 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 1 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
library(lattice)
densityplot(imps)
#What we would like to see is that the shape of the magenta points (imputed) matches the shape of the blue ones (observed). The matching shape tells us that the imputed values are indeed “plausible values”.The density of the imputed data for each imputed dataset is showed in magenta while the density of the observed data is showed in blue. Again, under our previous assumptions we expect the distributions to be similar.
#mice reg model
fit_mcie = with(imps, glm(loan$Loan_Status~., family = binomial, data = loan))
Final_micefit <- pool(fit_mcie)
summary(Final_micefit)
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.429153e+00 | 9.311516e-01 | -2.6087617 | 462.966 | 9.380985e-03 |
| GenderMale | 3.253805e-01 | 3.309205e-01 | 0.9832590 | 462.966 | 3.259935e-01 |
| MarriedYes | 5.738867e-01 | 2.924218e-01 | 1.9625307 | 462.966 | 5.029919e-02 |
| Dependents1 | -3.756212e-01 | 3.460433e-01 | -1.0854745 | 462.966 | 2.782764e-01 |
| Dependents2 | 2.770496e-01 | 3.781582e-01 | 0.7326289 | 462.966 | 4.641556e-01 |
| Dependents3+ | 1.883805e-01 | 4.874361e-01 | 0.3864722 | 462.966 | 6.993246e-01 |
| EducationNot Graduate | -4.209972e-01 | 3.032836e-01 | -1.3881303 | 462.966 | 1.657650e-01 |
| Self_EmployedYes | -1.491721e-01 | 3.523418e-01 | -0.4233733 | 462.966 | 6.722195e-01 |
| ApplicantIncome | 6.945049e-06 | 2.862075e-05 | 0.2426578 | 462.966 | 8.083780e-01 |
| CoapplicantIncome | -5.142946e-05 | 4.307183e-05 | -1.1940393 | 462.966 | 2.330740e-01 |
# porpuslful selection
#step1
summary(pool(with(imps, glm(loan$Loan_Status~Gender, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 0.7065702 | 0.2008988 | 3.5170447 | 596.9494 | 0.0004695274 |
| GenderMale | 0.1087946 | 0.2235550 | 0.4866569 | 596.9494 | 0.6266800834 |
summary(pool(with(imps, glm(loan$Loan_Status~Married, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 0.5283919 | 0.1418482 | 3.725052 | 606.9482 | 0.0002134864 |
| MarriedYes | 0.3967094 | 0.1802199 | 2.201252 | 606.9482 | 0.0280938792 |
summary(pool(with(imps, glm(loan$Loan_Status~Dependents, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 0.7994418 | 0.1163936 | 6.8684360 | 592.9499 | 1.643929e-11 |
| Dependents1 | -0.1933060 | 0.2376484 | -0.8134120 | 592.9499 | 4.163084e-01 |
| Dependents2 | 0.3124157 | 0.2582736 | 1.2096306 | 592.9499 | 2.269027e-01 |
| Dependents3+ | -0.1933060 | 0.3152872 | -0.6131109 | 592.9499 | 5.400381e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Education, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 0.8873032 | 0.1004192 | 8.835989 | 609.9478 | 0.00000000 |
| EducationNot Graduate | -0.4318277 | 0.2037398 | -2.119506 | 609.9478 | 0.03445109 |
summary(pool(with(imps, glm(loan$Loan_Status~Self_Employed, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 0.78148464 | 0.09635804 | 8.11021709 | 577.9517 | 3.108624e-15 |
| Self_EmployedYes | -0.01422949 | 0.25613184 | -0.05555533 | 577.9517 | 9.557152e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~ApplicantIncome, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 7.964151e-01 | 1.158793e-01 | 6.8727999 | 609.9478 | 1.558997e-11 |
| ApplicantIncome | -1.644623e-06 | 1.409642e-05 | -0.1166696 | 609.9478 | 9.071603e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~CoapplicantIncome, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 8.551532e-01 | 0.1000224757 | 8.549611 | 609.9478 | 0.0000000 |
| CoapplicantIncome | -4.065626e-05 | 0.0000291521 | -1.394626 | 609.9478 | 0.1636364 |
summary(pool(with(imps, glm(loan$Loan_Status~LoanAmount, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 0.9551142970 | 0.174832448 | 5.4630265 | 587.9505 | 6.919104e-08 |
| LoanAmount | -0.0009145355 | 0.001010431 | -0.9050949 | 587.9505 | 3.657858e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Loan_Amount_Term, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 1.0477281261 | 0.484632654 | 2.1619016 | 595.9495 | 0.03102309 |
| Loan_Amount_Term | -0.0007227429 | 0.001388447 | -0.5205404 | 595.9495 | 0.60288028 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.460809 | 0.3937666 | -6.249410 | 559.9538 | 8.167031e-10 |
| Credit_History | 3.820992 | 0.4098865 | 9.322075 | 559.9538 | 0.000000e+00 |
summary(pool(with(imps, glm(loan$Loan_Status~Property_Area, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | 0.4663739 | 0.1535697 | 3.0368866 | 608.9479 | 0.0024925769 |
| Property_AreaSemiurban | 0.7320279 | 0.2183775 | 3.3521213 | 608.9479 | 0.0008516376 |
| Property_AreaUrban | 0.1898688 | 0.2135303 | 0.8891887 | 608.9479 | 0.3742528040 |
#we find that Married, Education, CoapplicantIncome, and Credit_History have p-values less than 0.2.
summary(pool(with(imps, glm(loan$Loan_Status~Married+Education+CoapplicantIncome
+Credit_History, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.652940e+00 | 4.338735e-01 | -6.114547 | 553.9546 | 1.829263e-09 |
| MarriedYes | 5.437148e-01 | 2.260484e-01 | 2.405302 | 553.9546 | 1.648597e-02 |
| EducationNot Graduate | -4.025707e-01 | 2.600842e-01 | -1.547848 | 553.9546 | 1.222300e-01 |
| CoapplicantIncome | -4.696803e-05 | 3.947181e-05 | -1.189913 | 553.9546 | 2.345902e-01 |
| Credit_History | 3.835977e+00 | 4.125966e-01 | 9.297161 | 553.9546 | 0.000000e+00 |
#step2
summary(pool(with(imps, glm(loan$Loan_Status~Married+Education+Credit_History, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.7134493 | 0.4317363 | -6.284969 | 554.9544 | 6.638152e-10 |
| MarriedYes | 0.5179787 | 0.2246489 | 2.305725 | 554.9544 | 2.149379e-02 |
| EducationNot Graduate | -0.3784572 | 0.2587755 | -1.462493 | 554.9544 | 1.441723e-01 |
| Credit_History | 3.8323938 | 0.4124852 | 9.290985 | 554.9544 | 0.000000e+00 |
summary(pool(with(imps, glm(loan$Loan_Status~Married+Credit_History, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.8194196 | 0.4276450 | -6.592897 | 555.9543 | 1.004119e-10 |
| MarriedYes | 0.5204595 | 0.2243575 | 2.319777 | 555.9543 | 2.071421e-02 |
| Credit_History | 3.8535614 | 0.4123719 | 9.344870 | 555.9543 | 0.000000e+00 |
#step3
summary(pool(with(imps, glm(loan$Loan_Status~Married+Credit_History
+Dependents, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.7812520 | 0.4341829 | -6.4057156 | 540.9562 | 3.255318e-10 |
| MarriedYes | 0.4961788 | 0.2457752 | 2.0188316 | 540.9562 | 4.399756e-02 |
| Credit_History | 3.8050633 | 0.4157340 | 9.1526380 | 540.9562 | 0.000000e+00 |
| Dependents1 | -0.1576165 | 0.3113703 | -0.5062029 | 540.9562 | 6.129204e-01 |
| Dependents2 | 0.2211681 | 0.3446544 | 0.6417098 | 540.9562 | 5.213335e-01 |
| Dependents3+ | 0.1704795 | 0.4417588 | 0.3859108 | 540.9562 | 6.997145e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+Education, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.7134493 | 0.4317363 | -6.284969 | 554.9544 | 6.638152e-10 |
| Credit_History | 3.8323938 | 0.4124852 | 9.290985 | 554.9544 | 0.000000e+00 |
| MarriedYes | 0.5179787 | 0.2246489 | 2.305725 | 554.9544 | 2.149379e-02 |
| EducationNot Graduate | -0.3784572 | 0.2587755 | -1.462493 | 554.9544 | 1.441723e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+Self_Employed, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.80068130 | 0.4305185 | -6.5053683 | 528.9576 | 1.798754e-10 |
| Credit_History | 3.80846796 | 0.4131399 | 9.2183489 | 528.9576 | 0.000000e+00 |
| MarriedYes | 0.53105029 | 0.2290141 | 2.3188540 | 528.9576 | 2.078313e-02 |
| Self_EmployedYes | -0.09129537 | 0.3184625 | -0.2866754 | 528.9576 | 7.744731e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+ApplicantIncome, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.816835e+00 | 4.389002e-01 | -6.41794023 | 554.9544 | 2.965130e-10 |
| Credit_History | 3.853415e+00 | 4.124028e-01 | 9.34381434 | 554.9544 | 0.000000e+00 |
| MarriedYes | 5.207964e-01 | 2.247314e-01 | 2.31741693 | 554.9544 | 2.084395e-02 |
| ApplicantIncome | -4.892081e-07 | 1.877092e-05 | -0.02606203 | 554.9544 | 9.792172e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+CoapplicantIncome, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.769323e+00 | 0.4295733695 | -6.446682 | 554.9544 | 2.486571e-10 |
| Credit_History | 3.856305e+00 | 0.4124010717 | 9.350862 | 554.9544 | 0.000000e+00 |
| MarriedYes | 5.435283e-01 | 0.2256928631 | 2.408265 | 554.9544 | 1.635357e-02 |
| CoapplicantIncome | -4.221221e-05 | 0.0000391437 | -1.078391 | 554.9544 | 2.813279e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+LoanAmount, family = binomial, data = loan)))) # only Loan_amount change Marries by at 10%
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.593209218 | 0.456561772 | -5.679865 | 534.9569 | 2.214888e-08 |
| Credit_History | 3.851287969 | 0.415002869 | 9.280148 | 534.9569 | 0.000000e+00 |
| MarriedYes | 0.597690646 | 0.234435021 | 2.549494 | 534.9569 | 1.106581e-02 |
| LoanAmount | -0.001578425 | 0.001302304 | -1.212026 | 534.9569 | 2.260376e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+Loan_Amount_Term, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.5381265295 | 0.750962937 | -3.3798293 | 540.9562 | 0.0007776295 |
| Credit_History | 3.7572039012 | 0.413835209 | 9.0789856 | 540.9562 | 0.0000000000 |
| MarriedYes | 0.5179037987 | 0.226065715 | 2.2909436 | 540.9562 | 0.0223503389 |
| Loan_Amount_Term | -0.0005979645 | 0.001780349 | -0.3358692 | 540.9562 | 0.7370997362 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+Credit_History, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.8194196 | 0.4276450 | -6.592897 | 555.9543 | 1.004119e-10 |
| Credit_History | 3.8535614 | 0.4123719 | 9.344870 | 555.9543 | 0.000000e+00 |
| MarriedYes | 0.5204595 | 0.2243575 | 2.319777 | 555.9543 | 2.071421e-02 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+Property_Area, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -3.3106642 | 0.4700045 | -7.0438985 | 553.9546 | 5.567324e-12 |
| Credit_History | 3.9412402 | 0.4201451 | 9.3806634 | 553.9546 | 0.000000e+00 |
| MarriedYes | 0.5235012 | 0.2275979 | 2.3001146 | 553.9546 | 2.181253e-02 |
| Property_AreaSemiurban | 1.0170221 | 0.2826172 | 3.5985854 | 553.9546 | 3.486179e-04 |
| Property_AreaUrban | 0.1983775 | 0.2642820 | 0.7506281 | 553.9546 | 4.531952e-01 |
#step4:Attempt adding plausible interactions among variables in the model, usually using somewhat stricter standards such a p-value<0.05 (can consider non-linear predictor terms, like quadratic effects, in this step as well).
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+LoanAmount + Credit_History:Married, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.038889264 | 0.628114204 | -3.246049 | 533.957 | 1.243569e-03 |
| Credit_History | 3.260244303 | 0.633266135 | 5.148300 | 533.957 | 3.701468e-07 |
| MarriedYes | -0.241660259 | 0.800493627 | -0.301889 | 533.957 | 7.628542e-01 |
| LoanAmount | -0.001591522 | 0.001312593 | -1.212502 | 533.957 | 2.258563e-01 |
| Credit_History:MarriedYes | 0.907892569 | 0.834453362 | 1.088009 | 533.957 | 2.770821e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+LoanAmount + Married:LoanAmount, family = binomial, data = loan))))
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -2.429748216 | 0.535116244 | -4.5405989 | 533.957 | 6.938592e-06 |
| Credit_History | 3.850970096 | 0.414692624 | 9.2863241 | 533.957 | 0.000000e+00 |
| MarriedYes | 0.358863596 | 0.470307786 | 0.7630399 | 533.957 | 4.457766e-01 |
| LoanAmount | -0.002832526 | 0.002490300 | -1.1374237 | 533.957 | 2.558714e-01 |
| MarriedYes:LoanAmount | 0.001724208 | 0.002933497 | 0.5877654 | 533.957 | 5.569382e-01 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+LoanAmount + Credit_History:LoanAmount,
family = binomial, data = loan)))) # Credit_History:LoanAmount p = 0.034
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -3.661102027 | 0.734296000 | -4.985867 | 533.957 | 8.352020e-07 |
| Credit_History | 5.032270384 | 0.755641236 | 6.659603 | 533.957 | 6.837664e-11 |
| MarriedYes | 0.603371702 | 0.236631788 | 2.549834 | 533.957 | 1.105569e-02 |
| LoanAmount | 0.004929671 | 0.003181245 | 1.549605 | 533.957 | 1.218289e-01 |
| Credit_History:LoanAmount | -0.007282560 | 0.003425296 | -2.126111 | 533.957 | 3.395215e-02 |
summary(pool(with(imps, glm(loan$Loan_Status~Credit_History+Married
+LoanAmount+Credit_History:LoanAmount+I(LoanAmount^2), family = binomial, data = loan)))) # Credit_History:LoanAmount p = 0.046
term <fctr> | estimate <dbl> | std.error <dbl> | statistic <dbl> | df <dbl> | p.value <dbl> |
|---|---|---|---|---|---|
| (Intercept) | -3.464251e+00 | 8.361211e-01 | -4.1432409 | 532.9571 | 3.981945e-05 |
| Credit_History | 4.984143e+00 | 7.583655e-01 | 6.5722168 | 532.9571 | 1.182234e-10 |
| MarriedYes | 6.132121e-01 | 2.375298e-01 | 2.5816220 | 532.9571 | 1.009963e-02 |
| LoanAmount | 2.972242e-03 | 5.181382e-03 | 0.5736388 | 532.9571 | 5.664542e-01 |
| I(LoanAmount^2) | 3.132992e-06 | 6.551381e-06 | 0.4782185 | 532.9571 | 6.326908e-01 |
| Credit_History:LoanAmount | -6.989972e-03 | 3.490626e-03 | -2.0024982 | 532.9571 | 4.573727e-02 |
# Conclusion:model included: Credit_History, Married, LoanAmount, Credit_History*LoanAmount
########################################################################
#complete the missing value by using final model predictor see what's going on
one_of_dataset <- complete(imps,1)
model1 <-glm(Loan_Status~Credit_History+Married+LoanAmount+
Credit_History:LoanAmount, family = binomial, data = one_of_dataset)
rocplotone11 <- roc(one_of_dataset$Loan_Status ~ fitted(model1), data = one_of_dataset)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot.roc(rocplotone11, print.auc = TRUE)
auc(rocplotone11) #0.78 Auc depdent on different dataset
## Area under the curve: 0.7704
#using different complete dataset and stepAIC() see what's going on,
stepAIC(glm(Loan_Status~., family = binomial, data = one_of_dataset))
## Start: AIC=575.12
## Loan_Status ~ Gender + Married + Dependents + Education + Self_Employed +
## ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + Property_Area
##
## Df Deviance AIC
## - Dependents 3 547.68 571.68
## - Self_Employed 1 545.12 573.12
## - Gender 1 545.13 573.13
## - Loan_Amount_Term 1 545.38 573.38
## - ApplicantIncome 1 545.48 573.48
## - CoapplicantIncome 1 545.87 573.87
## - LoanAmount 1 547.00 575.00
## <none> 545.12 575.12
## - Education 1 547.69 575.69
## - Married 1 551.09 579.09
## - Property_Area 2 558.46 584.46
## - Credit_History 1 733.48 761.48
##
## Step: AIC=571.68
## Loan_Status ~ Gender + Married + Education + Self_Employed +
## ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + Property_Area
##
## Df Deviance AIC
## - Self_Employed 1 547.68 569.68
## - Gender 1 547.72 569.72
## - Loan_Amount_Term 1 547.83 569.83
## - ApplicantIncome 1 548.06 570.06
## - CoapplicantIncome 1 548.28 570.28
## <none> 547.68 571.68
## - LoanAmount 1 549.70 571.70
## - Education 1 550.24 572.24
## - Married 1 554.09 576.09
## - Property_Area 2 560.50 580.50
## - Credit_History 1 736.94 758.94
##
## Step: AIC=569.68
## Loan_Status ~ Gender + Married + Education + ApplicantIncome +
## CoapplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History +
## Property_Area
##
## Df Deviance AIC
## - Gender 1 547.72 567.72
## - Loan_Amount_Term 1 547.83 567.83
## - ApplicantIncome 1 548.06 568.06
## - CoapplicantIncome 1 548.28 568.28
## <none> 547.68 569.68
## - LoanAmount 1 549.71 569.71
## - Education 1 550.25 570.25
## - Married 1 554.09 574.09
## - Property_Area 2 560.50 578.50
## - Credit_History 1 737.00 757.00
##
## Step: AIC=567.72
## Loan_Status ~ Married + Education + ApplicantIncome + CoapplicantIncome +
## LoanAmount + Loan_Amount_Term + Credit_History + Property_Area
##
## Df Deviance AIC
## - Loan_Amount_Term 1 547.88 565.88
## - ApplicantIncome 1 548.10 566.10
## - CoapplicantIncome 1 548.29 566.29
## <none> 547.72 567.72
## - LoanAmount 1 549.72 567.72
## - Education 1 550.26 568.26
## - Married 1 555.37 573.37
## - Property_Area 2 560.54 576.54
## - Credit_History 1 737.13 755.13
##
## Step: AIC=565.88
## Loan_Status ~ Married + Education + ApplicantIncome + CoapplicantIncome +
## LoanAmount + Credit_History + Property_Area
##
## Df Deviance AIC
## - ApplicantIncome 1 548.33 564.33
## - CoapplicantIncome 1 548.42 564.42
## <none> 547.88 565.88
## - LoanAmount 1 550.06 566.06
## - Education 1 550.32 566.32
## - Married 1 555.95 571.95
## - Property_Area 2 560.66 574.66
## - Credit_History 1 737.27 753.27
##
## Step: AIC=564.33
## Loan_Status ~ Married + Education + CoapplicantIncome + LoanAmount +
## Credit_History + Property_Area
##
## Df Deviance AIC
## - CoapplicantIncome 1 549.21 563.21
## - LoanAmount 1 550.18 564.18
## <none> 548.33 564.33
## - Education 1 550.84 564.84
## - Married 1 556.29 570.29
## - Property_Area 2 561.06 573.06
## - Credit_History 1 737.29 751.29
##
## Step: AIC=563.21
## Loan_Status ~ Married + Education + LoanAmount + Credit_History +
## Property_Area
##
## Df Deviance AIC
## <none> 549.21 563.21
## - LoanAmount 1 551.32 563.32
## - Education 1 551.53 563.53
## - Married 1 556.81 568.81
## - Property_Area 2 562.08 572.08
## - Credit_History 1 739.34 751.34
##
## Call: glm(formula = Loan_Status ~ Married + Education + LoanAmount +
## Credit_History + Property_Area, family = binomial, data = one_of_dataset)
##
## Coefficients:
## (Intercept) MarriedYes EducationNot Graduate
## -3.01061 0.61467 -0.39980
## LoanAmount Credit_History Property_AreaSemiurban
## -0.00172 4.00501 0.89510
## Property_AreaUrban
## 0.15809
##
## Degrees of Freedom: 613 Total (i.e. Null); 607 Residual
## Null Deviance: 762.9
## Residual Deviance: 549.2 AIC: 563.2
#run the smallest AIC
aci_model<-glm(formula = Loan_Status ~ Married + Credit_History + Property_Area,
family = binomial, data = one_of_dataset)
#ROC
rocplotone <- roc(one_of_dataset$Loan_Status ~ fitted(aci_model), data = one_of_dataset)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot.roc(rocplotone, print.auc = TRUE)
auc(rocplotone)
## Area under the curve: 0.7904
#Purposeful Model Selection
##step1 Fit “simple” logistic regression models for each of the predictors separately.
#Eliminate any predictor values with large p-values (say >0.2).
summary(glm(Loan_Status ~ Gender, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Gender, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5666 -1.4421 0.8330 0.8330 0.9341
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6035 0.2102 2.871 0.00409 **
## Gender 0.2766 0.2358 1.173 0.24075
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 640.40 on 520 degrees of freedom
## AIC: 644.4
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Married, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6047 -1.4358 0.8036 0.8036 0.9394
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5895 0.1535 3.841 0.000123 ***
## Married 0.3751 0.1960 1.914 0.055617 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 638.12 on 520 degrees of freedom
## AIC: 642.12
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Dependents, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Dependents, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5437 -1.5387 0.8510 0.8529 0.8569
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.829448 0.116986 7.090 1.34e-12 ***
## Dependents -0.005578 0.096089 -0.058 0.954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 641.75 on 520 degrees of freedom
## AIC: 645.75
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Education, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Education, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6099 -1.3594 0.7997 0.7997 1.0057
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4182 0.1786 2.342 0.01919 *
## Education 0.5579 0.2116 2.637 0.00836 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 634.93 on 520 degrees of freedom
## AIC: 638.93
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Self_Employed, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Self_Employed, family = binomial,
## data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5467 -1.5467 0.8486 0.8486 0.8817
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.83601 0.10119 8.262 <2e-16 ***
## Self_Employed -0.09157 0.29643 -0.309 0.757
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 641.66 on 520 degrees of freedom
## AIC: 645.66
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ ApplicantIncome, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ ApplicantIncome, family = binomial,
## data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6084 -1.5253 0.8464 0.8599 0.8857
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.284e-01 2.323e-01 3.136 0.00171 **
## ApplicantIncome 2.445e-05 5.357e-05 0.457 0.64802
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 641.55 on 520 degrees of freedom
## AIC: 645.55
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ CoapplicantIncome, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ CoapplicantIncome, family = binomial,
## data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7310 -1.4821 0.8210 0.9007 0.9007
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.927e-01 1.288e-01 5.379 7.5e-08 ***
## CoapplicantIncome 1.042e-04 7.027e-05 1.483 0.138
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 639.51 on 520 degrees of freedom
## AIC: 643.51
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ LoanAmount, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ LoanAmount, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6031 -1.5224 0.8424 0.8568 0.9001
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.971599 0.308215 3.152 0.00162 **
## LoanAmount -0.001175 0.002351 -0.500 0.61718
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 641.51 on 520 degrees of freedom
## AIC: 645.51
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Loan_Amount_Term, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Loan_Amount_Term, family = binomial,
## data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6623 -1.5352 0.8577 0.8577 0.8956
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1236351 0.5179354 2.169 0.030 *
## Loan_Amount_Term -0.0008693 0.0014812 -0.587 0.557
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 641.41 on 520 degrees of freedom
## AIC: 645.41
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Credit_History, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Credit_History, family = binomial,
## data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8068 -0.3664 0.6596 0.6596 2.3385
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.6672 0.4625 -5.767 8.06e-09 ***
## Credit_History 4.0819 0.4777 8.545 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 476.77 on 520 degrees of freedom
## AIC: 480.77
##
## Number of Fisher Scoring iterations: 5
summary(glm(Loan_Status ~ Urban, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Urban, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5706 -1.4823 0.8299 0.8299 0.9005
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8890 0.1165 7.632 2.31e-14 ***
## Urban -0.1959 0.2021 -0.969 0.332
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 640.82 on 520 degrees of freedom
## AIC: 644.82
##
## Number of Fisher Scoring iterations: 4
summary(glm(Loan_Status ~ Rural, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Rural, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6137 -1.3916 0.7968 0.7968 0.9774
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9846 0.1178 8.359 <2e-16 ***
## Rural -0.4940 0.2019 -2.447 0.0144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 635.84 on 520 degrees of freedom
## AIC: 639.84
##
## Number of Fisher Scoring iterations: 4
summary(glm(formula=Loan_Status ~ Married + Education + CoapplicantIncome + Credit_History + Rural, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + CoapplicantIncome +
## Credit_History + Rural, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1357 -0.3367 0.5501 0.6936 2.6328
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.263e+00 5.423e-01 -6.017 1.78e-09 ***
## Married 4.288e-01 2.469e-01 1.736 0.0825 .
## Education 4.215e-01 2.613e-01 1.613 0.1067
## CoapplicantIncome 1.313e-04 9.166e-05 1.432 0.1521
## Credit_History 4.143e+00 4.838e-01 8.564 < 2e-16 ***
## Rural -5.927e-01 2.482e-01 -2.388 0.0169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 461.17 on 516 degrees of freedom
## AIC: 473.17
##
## Number of Fisher Scoring iterations: 5
#step2 Conduct forward stepwise selection with remaining predictors, usually using a more stringent cut-off, such as p-value<0.1 or perhaps AIC/BIC.
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History + Rural, family=binomial, data=loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0279 -0.3518 0.5233 0.6631 2.5710
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1672 0.5365 -5.903 3.57e-09 ***
## Married 0.5165 0.2389 2.162 0.0306 *
## Education 0.4509 0.2603 1.732 0.0832 .
## Credit_History 4.1191 0.4820 8.546 < 2e-16 ***
## Rural -0.5514 0.2455 -2.246 0.0247 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 463.30 on 517 degrees of freedom
## AIC: 473.3
##
## Number of Fisher Scoring iterations: 5
#step3 Consider adding in any variables that were not included in the model after Step 1 or Step 2. A predictor can be added in even if p-value>0.1 if the AIC/BIC is lower or if it changes the estimated β coefficients by at least, say, 10%.
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban, family=binomial, data = loan_data_R)) # chnage rural by at 50%
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1969 -0.2944 0.5423 0.6690 2.5945
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.8709 0.5516 -5.204 1.95e-07 ***
## Married 0.5246 0.2406 2.180 0.02923 *
## Education 0.4767 0.2632 1.811 0.07011 .
## Credit_History 4.1890 0.4883 8.579 < 2e-16 ***
## Rural -0.9362 0.2979 -3.142 0.00168 **
## Urban -0.7706 0.3011 -2.559 0.01050 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 456.59 on 516 degrees of freedom
## AIC: 468.59
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Semiurban, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Semiurban, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1969 -0.2944 0.5423 0.6690 2.5945
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6415 0.5797 -6.282 3.35e-10 ***
## Married 0.5246 0.2406 2.180 0.0292 *
## Education 0.4767 0.2632 1.811 0.0701 .
## Credit_History 4.1890 0.4883 8.579 < 2e-16 ***
## Rural -0.1656 0.2812 -0.589 0.5559
## Semiurban 0.7706 0.3011 2.559 0.0105 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 456.59 on 516 degrees of freedom
## AIC: 468.59
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Loan_Amount_Term, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Loan_Amount_Term, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0434 -0.3515 0.5253 0.6644 2.5703
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.0398138 0.8326495 -3.651 0.000261 ***
## Married 0.5123867 0.2398351 2.136 0.032646 *
## Education 0.4547076 0.2610463 1.742 0.081532 .
## Credit_History 4.1169297 0.4819874 8.542 < 2e-16 ***
## Rural -0.5472129 0.2463555 -2.221 0.026335 *
## Loan_Amount_Term -0.0003705 0.0018553 -0.200 0.841729
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 463.26 on 516 degrees of freedom
## AIC: 475.26
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + LoanAmount, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + LoanAmount, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1061 -0.3487 0.5358 0.6623 2.5788
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.886694 0.611223 -4.723 2.33e-06 ***
## Married 0.565866 0.245036 2.309 0.0209 *
## Education 0.489188 0.264156 1.852 0.0640 .
## Credit_History 4.125726 0.482439 8.552 < 2e-16 ***
## Rural -0.519677 0.247802 -2.097 0.0360 *
## LoanAmount -0.002857 0.003028 -0.943 0.3455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 462.40 on 516 degrees of freedom
## AIC: 474.4
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + CoapplicantIncome, family=binomial, data = loan_data_R)) #change married by aroudn 18%
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + CoapplicantIncome, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1357 -0.3367 0.5501 0.6936 2.6328
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.263e+00 5.423e-01 -6.017 1.78e-09 ***
## Married 4.288e-01 2.469e-01 1.736 0.0825 .
## Education 4.215e-01 2.613e-01 1.613 0.1067
## Credit_History 4.143e+00 4.838e-01 8.564 < 2e-16 ***
## Rural -5.927e-01 2.482e-01 -2.388 0.0169 *
## CoapplicantIncome 1.313e-04 9.166e-05 1.432 0.1521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 461.17 on 516 degrees of freedom
## AIC: 473.17
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + ApplicantIncome, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + ApplicantIncome, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0528 -0.3545 0.5261 0.6632 2.5762
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.106e+00 5.746e-01 -5.406 6.43e-08 ***
## Married 5.164e-01 2.390e-01 2.161 0.0307 *
## Education 4.619e-01 2.632e-01 1.755 0.0792 .
## Credit_History 4.125e+00 4.824e-01 8.549 < 2e-16 ***
## Rural -5.434e-01 2.471e-01 -2.199 0.0279 *
## ApplicantIncome -1.902e-05 6.506e-05 -0.292 0.7700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 463.21 on 516 degrees of freedom
## AIC: 475.21
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Self_Employed, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Self_Employed, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0430 -0.3561 0.5147 0.6537 2.5629
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1500 0.5367 -5.870 4.37e-09 ***
## Married 0.5199 0.2391 2.174 0.0297 *
## Education 0.4516 0.2603 1.735 0.0828 .
## Credit_History 4.1330 0.4828 8.561 < 2e-16 ***
## Rural -0.5476 0.2457 -2.228 0.0259 *
## Self_Employed -0.2762 0.3492 -0.791 0.4290
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 462.69 on 516 degrees of freedom
## AIC: 474.69
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Dependents, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Dependents, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0447 -0.3581 0.5288 0.6624 2.5682
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.14722 0.53957 -5.833 5.45e-09 ***
## Married 0.54758 0.25908 2.114 0.0346 *
## Education 0.44203 0.26163 1.690 0.0911 .
## Credit_History 4.11596 0.48196 8.540 < 2e-16 ***
## Rural -0.55483 0.24578 -2.257 0.0240 *
## Dependents -0.04106 0.13060 -0.314 0.7532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 463.20 on 516 degrees of freedom
## AIC: 475.2
##
## Number of Fisher Scoring iterations: 5
# we adding back semiurban first
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban, family=binomial, data = loan_data_R)) # chnage rural by more than 50%
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1969 -0.2944 0.5423 0.6690 2.5945
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.8709 0.5516 -5.204 1.95e-07 ***
## Married 0.5246 0.2406 2.180 0.02923 *
## Education 0.4767 0.2632 1.811 0.07011 .
## Credit_History 4.1890 0.4883 8.579 < 2e-16 ***
## Rural -0.9362 0.2979 -3.142 0.00168 **
## Urban -0.7706 0.3011 -2.559 0.01050 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 456.59 on 516 degrees of freedom
## AIC: 468.59
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban + CoapplicantIncome, family=binomial, data = loan_data_R)) #chnage married about 17% and rural about 30%
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + CoapplicantIncome, family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2741 -0.3060 0.5240 0.6792 2.6515
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.959e+00 5.570e-01 -5.313 1.08e-07 ***
## Married 4.394e-01 2.487e-01 1.767 0.07728 .
## Education 4.486e-01 2.641e-01 1.698 0.08943 .
## Credit_History 4.204e+00 4.891e-01 8.597 < 2e-16 ***
## Rural -9.742e-01 3.003e-01 -3.244 0.00118 **
## Urban -7.634e-01 3.015e-01 -2.533 0.01132 *
## CoapplicantIncome 1.280e-04 9.235e-05 1.386 0.16573
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 454.60 on 515 degrees of freedom
## AIC: 468.6
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban + CoapplicantIncome + LoanAmount, family=binomial, data = loan_data_R)) #chnage over 10%
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + CoapplicantIncome + LoanAmount, family = binomial,
## data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2628 -0.3130 0.5124 0.6822 2.6779
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.462e+00 6.382e-01 -3.858 0.000114 ***
## Married 5.058e-01 2.533e-01 1.997 0.045843 *
## Education 4.996e-01 2.676e-01 1.867 0.061942 .
## Credit_History 4.214e+00 4.892e-01 8.613 < 2e-16 ***
## Rural -9.516e-01 3.012e-01 -3.159 0.001581 **
## Urban -8.112e-01 3.039e-01 -2.669 0.007608 **
## CoapplicantIncome 1.608e-04 9.459e-05 1.699 0.089227 .
## LoanAmount -4.948e-03 3.150e-03 -1.571 0.116229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 452.12 on 514 degrees of freedom
## AIC: 468.12
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban + CoapplicantIncome + LoanAmount + Dependents, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + CoapplicantIncome + LoanAmount + Dependents,
## family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2625 -0.3135 0.5114 0.6830 2.6770
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.459e+00 6.404e-01 -3.840 0.000123 ***
## Married 5.121e-01 2.762e-01 1.854 0.063757 .
## Education 4.982e-01 2.687e-01 1.854 0.063767 .
## Credit_History 4.213e+00 4.894e-01 8.608 < 2e-16 ***
## Rural -9.526e-01 3.017e-01 -3.158 0.001590 **
## Urban -8.115e-01 3.040e-01 -2.669 0.007598 **
## CoapplicantIncome 1.600e-04 9.559e-05 1.673 0.094262 .
## LoanAmount -4.934e-03 3.160e-03 -1.561 0.118423
## Dependents -7.637e-03 1.342e-01 -0.057 0.954607
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 452.11 on 513 degrees of freedom
## AIC: 470.11
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban + CoapplicantIncome + LoanAmount + ApplicantIncome, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + CoapplicantIncome + LoanAmount + ApplicantIncome,
## family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2676 -0.3163 0.5071 0.6855 2.6864
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.554e+00 6.465e-01 -3.951 7.78e-05 ***
## Married 5.092e-01 2.546e-01 2.000 0.04549 *
## Education 4.621e-01 2.697e-01 1.713 0.08666 .
## Credit_History 4.196e+00 4.891e-01 8.580 < 2e-16 ***
## Rural -9.849e-01 3.031e-01 -3.249 0.00116 **
## Urban -8.128e-01 3.043e-01 -2.671 0.00757 **
## CoapplicantIncome 2.284e-04 1.129e-04 2.022 0.04316 *
## LoanAmount -7.751e-03 3.991e-03 -1.942 0.05209 .
## ApplicantIncome 1.025e-04 8.779e-05 1.167 0.24302
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 450.71 on 513 degrees of freedom
## AIC: 468.71
##
## Number of Fisher Scoring iterations: 5
#the following are just roughtly change near 10% but p valur is very large we just ingore
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban + CoapplicantIncome + LoanAmount + Self_Employed, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + CoapplicantIncome + LoanAmount + Self_Employed,
## family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2759 -0.3131 0.5116 0.6847 2.6676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.464e+00 6.386e-01 -3.859 0.000114 ***
## Married 5.078e-01 2.535e-01 2.003 0.045169 *
## Education 4.981e-01 2.677e-01 1.861 0.062781 .
## Credit_History 4.226e+00 4.901e-01 8.623 < 2e-16 ***
## Rural -9.544e-01 3.015e-01 -3.165 0.001549 **
## Urban -8.239e-01 3.048e-01 -2.703 0.006870 **
## CoapplicantIncome 1.543e-04 9.482e-05 1.627 0.103752
## LoanAmount -4.682e-03 3.175e-03 -1.475 0.140219
## Self_Employed -2.444e-01 3.601e-01 -0.679 0.497322
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 451.67 on 513 degrees of freedom
## AIC: 469.67
##
## Number of Fisher Scoring iterations: 5
#4Attempt adding plausible interactions among variables in the model,considering interactions as follows does not produce anything extra
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban + CoapplicantIncome + LoanAmount + LoanAmount:Credit_History, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + CoapplicantIncome + LoanAmount + LoanAmount:Credit_History,
## family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2495 -0.2977 0.5147 0.6857 2.6780
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.435e+00 1.887e+00 -2.351 0.01875 *
## Married 5.121e-01 2.543e-01 2.014 0.04402 *
## Education 5.217e-01 2.681e-01 1.946 0.05164 .
## Credit_History 6.273e+00 1.908e+00 3.288 0.00101 **
## Rural -9.429e-01 3.008e-01 -3.135 0.00172 **
## Urban -8.127e-01 3.046e-01 -2.668 0.00763 **
## CoapplicantIncome 1.496e-04 9.406e-05 1.591 0.11165
## LoanAmount 9.990e-03 1.300e-02 0.769 0.44218
## Credit_History:LoanAmount -1.570e-02 1.328e-02 -1.182 0.23732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 450.73 on 513 degrees of freedom
## AIC: 468.73
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural + Urban + CoapplicantIncome + LoanAmount + LoanAmount:Education, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + CoapplicantIncome + LoanAmount + LoanAmount:Education,
## family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2661 -0.3124 0.5107 0.6849 2.6760
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.229e+00 9.291e-01 -2.399 0.01644 *
## Married 5.133e-01 2.541e-01 2.020 0.04338 *
## Education 1.961e-01 9.171e-01 0.214 0.83067
## Credit_History 4.204e+00 4.897e-01 8.586 < 2e-16 ***
## Rural -9.477e-01 3.017e-01 -3.142 0.00168 **
## Urban -7.999e-01 3.056e-01 -2.618 0.00885 **
## CoapplicantIncome 1.596e-04 9.474e-05 1.685 0.09201 .
## LoanAmount -6.917e-03 6.508e-03 -1.063 0.28786
## Education:LoanAmount 2.521e-03 7.274e-03 0.346 0.72897
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 452.00 on 513 degrees of freedom
## AIC: 470
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural+ Urban + LoanAmount +CoapplicantIncome + Married:Credit_History, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + LoanAmount + CoapplicantIncome + Married:Credit_History,
## family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2706 -0.3055 0.5095 0.6820 2.5214
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.034e+00 8.471e-01 -2.401 0.01636 *
## Married -1.363e-01 9.628e-01 -0.142 0.88741
## Education 4.919e-01 2.688e-01 1.830 0.06726 .
## Credit_History 3.765e+00 7.689e-01 4.897 9.75e-07 ***
## Rural -9.492e-01 3.016e-01 -3.147 0.00165 **
## Urban -8.100e-01 3.038e-01 -2.666 0.00767 **
## LoanAmount -4.963e-03 3.153e-03 -1.574 0.11554
## CoapplicantIncome 1.645e-04 9.503e-05 1.731 0.08340 .
## Married:Credit_History 6.798e-01 9.880e-01 0.688 0.49142
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 451.66 on 513 degrees of freedom
## AIC: 469.66
##
## Number of Fisher Scoring iterations: 5
summary(glm(formula=Loan_Status ~ Married + Education + Credit_History
+ Rural+ Urban + LoanAmount +CoapplicantIncome + LoanAmount:CoapplicantIncome, family=binomial, data = loan_data_R))
##
## Call:
## glm(formula = Loan_Status ~ Married + Education + Credit_History +
## Rural + Urban + LoanAmount + CoapplicantIncome + LoanAmount:CoapplicantIncome,
## family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4184 -0.3083 0.4902 0.6895 2.6676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.839e+00 6.865e-01 -4.135 3.55e-05 ***
## Married 4.746e-01 2.554e-01 1.858 0.06317 .
## Education 5.250e-01 2.699e-01 1.945 0.05173 .
## Credit_History 4.212e+00 4.899e-01 8.598 < 2e-16 ***
## Rural -9.738e-01 3.023e-01 -3.222 0.00127 **
## Urban -8.372e-01 3.055e-01 -2.740 0.00614 **
## LoanAmount -1.859e-03 3.764e-03 -0.494 0.62137
## CoapplicantIncome 6.299e-04 3.346e-04 1.883 0.05977 .
## LoanAmount:CoapplicantIncome -3.303e-06 2.217e-06 -1.490 0.13623
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 449.92 on 513 degrees of freedom
## AIC: 467.92
##
## Number of Fisher Scoring iterations: 5
# thus, the finla model included these presictors
# Married, Credit_History, Urban, Rural,Education, CoapplicantIncome, LoanAmount
hand_best_model_dataset <-dplyr::select(loan_data_R, Married, Credit_History, Urban, Rural,Education, CoapplicantIncome, Loan_Status, LoanAmount)
bestModel <- glm(Loan_Status ~ Married + Credit_History + Urban + Rural
+ Education + CoapplicantIncome + LoanAmount, family = binomial, data = loan_data_R)
summary(bestModel)
##
## Call:
## glm(formula = Loan_Status ~ Married + Credit_History + Urban +
## Rural + Education + CoapplicantIncome + LoanAmount, family = binomial,
## data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2628 -0.3130 0.5124 0.6822 2.6779
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.462e+00 6.382e-01 -3.858 0.000114 ***
## Married 5.058e-01 2.533e-01 1.997 0.045843 *
## Credit_History 4.214e+00 4.892e-01 8.613 < 2e-16 ***
## Urban -8.112e-01 3.039e-01 -2.669 0.007608 **
## Rural -9.516e-01 3.012e-01 -3.159 0.001581 **
## Education 4.996e-01 2.676e-01 1.867 0.061942 .
## CoapplicantIncome 1.608e-04 9.459e-05 1.699 0.089227 .
## LoanAmount -4.948e-03 3.150e-03 -1.571 0.116229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 452.12 on 514 degrees of freedom
## AIC: 468.12
##
## Number of Fisher Scoring iterations: 5
cor(hand_best_model_dataset$Loan_Status, fitted(bestModel)) # R=0.6 is useful for comparing fits of different models for the same data.
## [1] 0.5973614
#ROC
rocBestModel <- roc(hand_best_model_dataset$Loan_Status ~ fitted(bestModel))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(rocBestModel, print.auc = TRUE)
auc(rocBestModel) #0.81
## Area under the curve: 0.8108
#hoslem.test
hoslem.test(loan_data_R$Loan_Status, fitted(bestModel)) #0.06623
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: loan_data_R$Loan_Status, fitted(bestModel)
## X-squared = 14.654, df = 8, p-value = 0.06623
#Confusion Matrix
fit123 = predict(bestModel, newdata = loan_data_R, type = 'response')
# If results are more than 50% then convert to 1 else 0
fit123 = ifelse(fit123 >=0.5,1,0) #Setting cut-off to be at 0.5
# Evaluate predictions on the training dataset through Confusion Matrix
cnf_matrix = table(predicted = fit123, actual = loan_data_R$Loan_Status)
cnf_matrix
## actual
## predicted 0 1
## 0 72 5
## 1 87 358
TN = cnf_matrix[1,1] # True Negative - Actual & Predicted is 0/N
TP = cnf_matrix[2,2] # True Positive - Actual & Predicted is 1/Y
FP = cnf_matrix[2,1] # False Positive - Actual is 0/N but Predicted is 1/Y
FN = cnf_matrix[1,2] # False Nefgative - Actual is 1/Y but Predicted is 0/N
TO = TN+TP+FP+FN # Total Observations
accuracy = (TP+TN)/TO # Accuracy or Prevalance of Confusion Matrix = 0.82
accuracy
## [1] 0.8237548
precision = TP/(TP+FP) # Precision = 0.80
precision
## [1] 0.8044944
sensitivity = TP/(TP+FN) # True Positive Rate = 0.98
sensitivity
## [1] 0.9862259
error = (FP+FN)/TO # Error Rate=0.18
error
## [1] 0.1762452
specificity = TN/(TN+FP)
specificity #0.45
## [1] 0.4528302
G=sqrt(specificity*sensitivity) ##G-mean=0.67
G
## [1] 0.668276
####################################################################################
#Aautomatic backward selection
summary(reg1)
##
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2938 -0.3139 0.5005 0.6809 2.6244
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.541e+00 9.550e-01 -2.660 0.007806 **
## Gender 3.093e-01 3.257e-01 0.950 0.342319
## Married 4.423e-01 2.920e-01 1.515 0.129762
## Dependents -2.877e-02 1.366e-01 -0.211 0.833221
## Education 4.837e-01 2.740e-01 1.766 0.077460 .
## Self_Employed -3.218e-01 3.669e-01 -0.877 0.380433
## ApplicantIncome 1.019e-04 8.949e-05 1.139 0.254630
## CoapplicantIncome 2.072e-04 1.152e-04 1.799 0.072011 .
## LoanAmount -7.525e-03 4.023e-03 -1.871 0.061383 .
## Loan_Amount_Term -4.842e-04 1.937e-03 -0.250 0.802634
## Credit_History 4.215e+00 4.910e-01 8.585 < 2e-16 ***
## Urban -8.709e-01 3.097e-01 -2.812 0.004920 **
## Rural -1.012e+00 3.054e-01 -3.315 0.000917 ***
## Semiurban NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 449.00 on 509 degrees of freedom
## AIC: 475
##
## Number of Fisher Scoring iterations: 5
reg2 <- update(reg1,.~.-Semiurban)
summary(reg2)
##
## Call:
## glm(formula = Loan_Status ~ Gender + Married + Dependents + Education +
## Self_Employed + ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Urban + Rural, family = binomial,
## data = loan_data_R)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2938 -0.3139 0.5005 0.6809 2.6244
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.541e+00 9.550e-01 -2.660 0.007806 **
## Gender 3.093e-01 3.257e-01 0.950 0.342319
## Married 4.423e-01 2.920e-01 1.515 0.129762
## Dependents -2.877e-02 1.366e-01 -0.211 0.833221
## Education 4.837e-01 2.740e-01 1.766 0.077460 .
## Self_Employed -3.218e-01 3.669e-01 -0.877 0.380433
## ApplicantIncome 1.019e-04 8.949e-05 1.139 0.254630
## CoapplicantIncome 2.072e-04 1.152e-04 1.799 0.072011 .
## LoanAmount -7.525e-03 4.023e-03 -1.871 0.061383 .
## Loan_Amount_Term -4.842e-04 1.937e-03 -0.250 0.802634
## Credit_History 4.215e+00 4.910e-01 8.585 < 2e-16 ***
## Urban -8.709e-01 3.097e-01 -2.812 0.004920 **
## Rural -1.012e+00 3.054e-01 -3.315 0.000917 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641.76 on 521 degrees of freedom
## Residual deviance: 449.00 on 509 degrees of freedom
## AIC: 475
##
## Number of Fisher Scoring iterations: 5
stepAIC(reg2, direction = "backward")
## Start: AIC=475
## Loan_Status ~ Gender + Married + Dependents + Education + Self_Employed +
## ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + Urban + Rural
##
## Df Deviance AIC
## - Dependents 1 449.05 473.05
## - Loan_Amount_Term 1 449.07 473.07
## - Self_Employed 1 449.75 473.75
## - Gender 1 449.89 473.89
## - ApplicantIncome 1 450.33 474.33
## <none> 449.00 475.00
## - Married 1 451.29 475.29
## - Education 1 452.06 476.06
## - CoapplicantIncome 1 452.44 476.44
## - LoanAmount 1 452.60 476.60
## - Urban 1 457.15 481.15
## - Rural 1 460.41 484.41
## - Credit_History 1 612.00 636.00
##
## Step: AIC=473.05
## Loan_Status ~ Gender + Married + Education + Self_Employed +
## ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + Urban + Rural
##
## Df Deviance AIC
## - Loan_Amount_Term 1 449.10 471.10
## - Self_Employed 1 449.80 471.80
## - Gender 1 449.91 471.91
## - ApplicantIncome 1 450.36 472.36
## <none> 449.05 473.05
## - Married 1 451.40 473.40
## - Education 1 452.19 474.19
## - CoapplicantIncome 1 452.61 474.61
## - LoanAmount 1 452.68 474.68
## - Urban 1 457.16 479.16
## - Rural 1 460.41 482.41
## - Credit_History 1 612.38 634.38
##
## Step: AIC=471.1
## Loan_Status ~ Gender + Married + Education + Self_Employed +
## ApplicantIncome + CoapplicantIncome + LoanAmount + Credit_History +
## Urban + Rural
##
## Df Deviance AIC
## - Self_Employed 1 449.84 469.84
## - Gender 1 449.99 469.99
## - ApplicantIncome 1 450.53 470.53
## <none> 449.10 471.10
## - Married 1 451.54 471.54
## - Education 1 452.19 472.19
## - CoapplicantIncome 1 452.74 472.74
## - LoanAmount 1 452.92 472.92
## - Urban 1 457.16 477.16
## - Rural 1 460.52 480.52
## - Credit_History 1 612.51 632.51
##
## Step: AIC=469.84
## Loan_Status ~ Gender + Married + Education + ApplicantIncome +
## CoapplicantIncome + LoanAmount + Credit_History + Urban +
## Rural
##
## Df Deviance AIC
## - Gender 1 450.71 468.71
## - ApplicantIncome 1 451.01 469.01
## <none> 449.84 469.84
## - Married 1 452.26 470.26
## - Education 1 453.01 471.01
## - CoapplicantIncome 1 453.51 471.51
## - LoanAmount 1 453.72 471.72
## - Urban 1 457.64 475.64
## - Rural 1 461.13 479.13
## - Credit_History 1 612.67 630.67
##
## Step: AIC=468.71
## Loan_Status ~ Married + Education + ApplicantIncome + CoapplicantIncome +
## LoanAmount + Credit_History + Urban + Rural
##
## Df Deviance AIC
## - ApplicantIncome 1 452.12 468.12
## <none> 450.71 468.71
## - Education 1 453.59 469.59
## - LoanAmount 1 454.60 470.60
## - Married 1 454.69 470.69
## - CoapplicantIncome 1 455.10 471.10
## - Urban 1 458.03 474.03
## - Rural 1 461.66 477.66
## - Credit_History 1 613.48 629.48
##
## Step: AIC=468.12
## Loan_Status ~ Married + Education + CoapplicantIncome + LoanAmount +
## Credit_History + Urban + Rural
##
## Df Deviance AIC
## <none> 452.12 468.12
## - LoanAmount 1 454.60 468.60
## - CoapplicantIncome 1 455.12 469.12
## - Education 1 455.52 469.52
## - Married 1 456.08 470.08
## - Urban 1 459.42 473.42
## - Rural 1 462.44 476.44
## - Credit_History 1 616.89 630.89
##
## Call: glm(formula = Loan_Status ~ Married + Education + CoapplicantIncome +
## LoanAmount + Credit_History + Urban + Rural, family = binomial,
## data = loan_data_R)
##
## Coefficients:
## (Intercept) Married Education
## -2.4622823 0.5058005 0.4995745
## CoapplicantIncome LoanAmount Credit_History
## 0.0001608 -0.0049479 4.2136553
## Urban Rural
## -0.8111698 -0.9516449
##
## Degrees of Freedom: 521 Total (i.e. Null); 514 Residual
## Null Deviance: 641.8
## Residual Deviance: 452.1 AIC: 468.1
x<-cbind(loan_data_R$Married, loan_data_R$Education, loan_data_R$CoapplicantIncome,
loan_data_R$LoanAmount, loan_data_R$Credit_History, loan_data_R$Urban, loan_data_R$Rural,
loan_data_R$Gender, loan_data_R$Dependents, loan_data_R$Self_Employed, loan_data_R$ApplicantIncome, loan_data_R$Loan_Amount_Term +loan_data_R$Semiurban) # Combine variables by column
y<-loan_data_R$Loan_Status
grid<-10^seq(10,-2, length=100) # Create a grid of lambda values
lasso.mod=cv.glmnet(x,y,lambda=grid, # Build a CV lasso regression
nfold=length(y), # nfold=sample size, leave-one-out CV
alpha=1) # alpha=0, lasso reg is fit
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations
## per fold
## Ignore the received warning which recommends leaving 3-or-more out in CV ##
#Warning message:
#Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(log10(lasso.mod$lambda), lasso.mod$cvm, # Plot average CV error versus log(lambda)
xlab="log10(Lambda)", ylab="CV Error")
abline(v = log10(lasso.mod$lambda.min), lty = 3)
(lambda=lasso.mod$lambda.min) # The lambda that minimizes CV error
## [1] 0.01
predict(lasso.mod,s=lambda, # Obtain lasso reg coefs
type="coefficients")
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.0784336732
## V1 0.0435157043
## V2 0.0502408734
## V3 0.0000123858
## V4 -0.0002629468
## V5 0.7011271760
## V6 -0.0615330894
## V7 -0.0940765495
## V8 0.0229686712
## V9 .
## V10 -0.0074614315
## V11 .
## V12 .
#Confusion Matrix
fit1234 = predict(lasso.mod, newx = x, type = 'response')
# If results are more than 50% then convert to 1 else 0
fit1234 = ifelse(fit1234 >=0.5,1,0) #Setting cut-off to be at 0.5
# Evaluate predictions on the training dataset through Confusion Matrix
cnf_matrix2 = table(predicted = fit1234, actual = loan_data_R$Loan_Status)
cnf_matrix2
## actual
## predicted 0 1
## 0 72 5
## 1 87 358
TN2 = cnf_matrix2[1,1] # True Negative - Actual & Predicted is 0/N
TP2 = cnf_matrix2[2,2] # True Positive - Actual & Predicted is 1/Y
FP2 = cnf_matrix2[2,1] # False Positive - Actual is 0/N but Predicted is 1/Y
FN2 = cnf_matrix2[1,2] # False Nefgative - Actual is 1/Y but Predicted is 0/N
TO2 = TN2+TP2+FP2+FN2 # Total Observations
accuracy2 = (TP2+TN2)/TO2 # Accuracy or Prevalance of Confusion Matrix
accuracy2
## [1] 0.8237548
precision2 = TP2/(TP2+FP2)
precision2
## [1] 0.8044944
sensitivity2 = TP2/(TP2+FN2)
sensitivity2
## [1] 0.9862259
error2 = (FP2+FN2)/TO2
error2
## [1] 0.1762452
specificity2 = TN2/(TN2+FP2)
specificity2
## [1] 0.4528302
#G-mean
G2=sqrt(specificity2*sensitivity2)
G2
## [1] 0.668276